In [1]:
classpath.add(
"org.apache.spark" %% "spark-core" % "2.0.2",
"org.apache.spark" %% "spark-sql" % "2.0.2",
"org.apache.spark" %% "spark-mllib" % "2.0.2"
);
In [2]:
import org.apache.spark.sql.{SparkSession, DataFrame, Dataset}
In [3]:
val spark = SparkSession.builder().master("local[*]").getOrCreate()
In [4]:
import spark.implicits._
In [6]:
val data = spark.createDataset(Seq(1, 2, 3, 4, 5)).map(_.toDouble)
Implicit aggregations exist on RDDs
In [7]:
data.rdd.mean()
In [8]:
data.rdd.stdev()
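These methods come to RDD[Double] via the DoubleRDDFunctions implicit conversion. If you want several statistics at once, stats() computes them in a single pass:
// Returns an org.apache.spark.util.StatCounter holding count, mean, stdev,
// variance, max and min, all computed in one pass over the data
data.rdd.stats()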
But not on Dataset[Double]
In [8]:
data.mean()
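As an aside, Dataset does at least offer describe(), which returns a small DataFrame of summary statistics, though the values come back as strings rather than typed results:
// describe() computes count, mean, stddev, min and max for the named column,
// but the results are strings in a DataFrame, not typed values
data.describe("value").show()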
In [14]:
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{mean, stddev, sum}
Here's one way to do this directly with Dataset
In [15]:
data.agg(mean(data("value"))).as[Double].collect().head
Hideous, right? Let's make this a bit more generic
In [16]:
def applyFunctionToDatasetOfDouble(data: Dataset[Double], function: Column => Column): Double = {
  // Apply the aggregate to the Dataset's single "value" column and collect the scalar result
  data.agg(function(data("value"))).as[Double].collect().head
}
In [17]:
applyFunctionToDatasetOfDouble(data, mean)
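This generalizes naturally. Here's a hypothetical variant that takes the column name as a parameter, so it isn't tied to a Dataset[Double]'s implicit "value" column:
// Hypothetical generalization: aggregate any named column of a DataFrame
def aggregateColumn(df: DataFrame, colName: String, function: Column => Column): Double = {
  df.agg(function(df(colName))).as[Double].collect().head
}

// e.g. with the sum imported above
aggregateColumn(data.toDF(), "value", sum)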
Apparently stddev in sql.functions computes the sample standard deviation, unlike RDD.stdev(), which uses the population formula
In [19]:
applyFunctionToDatasetOfDouble(data, stddev)
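If you want to match the RDD result, sql.functions also provides stddev_pop (population) and stddev_samp (an explicit alias for what stddev does):
import org.apache.spark.sql.functions.stddev_pop

// stddev_pop computes the population standard deviation,
// matching what data.rdd.stdev() returns
applyFunctionToDatasetOfDouble(data, stddev_pop)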
Is this worth it? Is the conversion from Dataset to RDD expensive?
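A crude way to find out is to time both paths on the same data. The sketch below uses a hypothetical time helper and is no substitute for a real benchmark, but .rdd does have to deserialize every element out of Spark's internal row format, so the comparison is worth making:
// Hypothetical timing helper: runs the block once and prints elapsed wall time
def time[A](label: String)(block: => A): A = {
  val start = System.nanoTime()
  val result = block
  println(f"$label: ${(System.nanoTime() - start) / 1e6}%.1f ms")
  result
}

time("Dataset agg")(applyFunctionToDatasetOfDouble(data, mean))
time("RDD mean")(data.rdd.mean())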